{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "4f991098-08b9-4e8b-912e-ce79879908b9",
   "metadata": {},
   "source": [
    "### Structural variant scores, constraint and conservation \n",
    "\n",
    "* CADD-SV \n",
    "* PhyloP \n",
    "* PhastCons conserved elements \n",
    "* HARs\n",
    "* gnomAD constraint \n",
    "* gwRVIS\n",
    "* Excluded JARVIS\n",
    "\n",
    "---\n",
    "* Included MSC SV types - severely underpowered to detect significant differences \n",
    "* Exclued non-coding vs coding - currently annotating exons from protein-coding genes (so could be non-coding)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "80713c51-2a86-46cb-8c73-8ff8d022654a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd \n",
    "import numpy as np\n",
    "from collections import Counter\n",
    "import sys \n",
    "from patsy import dmatrices\n",
    "from pathlib import Path\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from statsmodels.discrete import discrete_model\n",
    "from pybedtools import BedTool\n",
    "from io import StringIO\n",
    "from functools import reduce\n",
    "from statsmodels.stats import multitest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "c8adb49c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# inputs \n",
    "wkdir = \"/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3/\"\n",
    "wkdir_path = Path(wkdir)\n",
    "\n",
    "# control and misexpressed SVs \n",
    "all_vrnts_path = wkdir_path.joinpath(\"5_misexp_vrnts/test_cntrl_sets/misexp_cntrl_final/vrnt_id_in_window_cntrl_misexp_genes.txt\")\n",
    "all_vrnts_bed_path = wkdir_path.joinpath(\"5_misexp_vrnts/test_cntrl_sets/misexp_cntrl_final/vrnt_id_in_windows_misexp_genes.bed\")\n",
    "misexp_vrnts_path = wkdir_path.joinpath(\"5_misexp_vrnts/test_cntrl_sets/misexp_cntrl_final/vrnt_id_misexp_tpm_zscore_median.txt\")\n",
    "\n",
    "sv_info_path = \"/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/lof_missense/data/sv_vcf/info_table/final_sites_critical_info_allele.txt\"\n",
    "# score paths \n",
    "cadd_sv_score_info_path = f\"/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression/5_functional_analysis/sv/data/cadd_sv/interval_svs/intrvl_all_svs/bed_out/merged/intrvl_svs_no_inv_121042_cadd_sv_info.tsv\"\n",
    "phylop_dir = wkdir_path.joinpath(\"5_misexp_vrnts/sv_scores/phylop\")\n",
    "gwrvis_dir = wkdir_path.joinpath(\"5_misexp_vrnts/sv_scores/gwrvis/v2\")\n",
    "#jarvis_dir = wkdir_path.joinpath(\"5_misexp_vrnts/sv_scores/jarvis\")\n",
    "gnomad_constraint_path = wkdir_path.joinpath(\"reference/gnomad/constraint_z_genome_1kb.qc.download.txt.clean\")\n",
    "gerp_elements_path = wkdir_path.joinpath(\"reference/conservation/gerp/gerpElements_hg38_multiz120Mammals.bed\")\n",
    "phastcons_elements_path = wkdir_path.joinpath(\"reference/conservation/phastcons/phastConsElements_hg38_multiz120Mammals.bed\")\n",
    "hars_bed_path = wkdir_path.joinpath(\"reference/conservation/hars/hg38_adc_hars.clean.bed\")\n",
    "fantom_5enh_path = wkdir_path.joinpath(\"reference/conservation/fantom5_enh/hg38_conserved10mer_mammal_FANTOMenhancer.clean.bed\")\n",
    "# output directory\n",
    "out_dir = wkdir_path.joinpath(\"5_misexp_vrnts/sv_scores/results/misexp_sv_zscore_final\")\n",
    "out_dir.mkdir(parents=True, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "56042387",
   "metadata": {},
   "outputs": [],
   "source": [
    "# constants \n",
    "CHROMOSOMES = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6',\n",
    "               'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12',\n",
    "               'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18',\n",
    "               'chr19', 'chr20', 'chr21', 'chr22']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "467bf3d8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total number of variants: 20262\n",
      "Number of misexpression-associated SVs: 105\n"
     ]
    }
   ],
   "source": [
    "# load bed file with variants in windows \n",
    "vrnts_in_windows_bed = BedTool(all_vrnts_bed_path)\n",
    "vrnts_in_windows_bed_sorted = vrnts_in_windows_bed.sort()\n",
    "# all variants in windows df\n",
    "all_vrnts_in_windows_df = pd.read_csv(all_vrnts_path, sep=\"\\t\", header=None).rename(columns={0:\"vrnt_id\"})\n",
    "print(f\"Total number of variants: {len(all_vrnts_in_windows_df.vrnt_id.unique())}\")\n",
    "\n",
    "# load misexpression-associated variants \n",
    "misexp_vrnts_ids = pd.read_csv(misexp_vrnts_path, sep=\"\\t\", header=None)[0].astype(str).unique()\n",
    "print(f\"Number of misexpression-associated SVs: {len(misexp_vrnts_ids)}\")\n",
    "all_vrnts_in_windows_df[\"misexp_uniq\"] = np.where(all_vrnts_in_windows_df.vrnt_id.isin(misexp_vrnts_ids), 1, 0)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5c3d2fc1-c28a-408f-ac1d-e9af8e739f52",
   "metadata": {},
   "source": [
    "### CADD-SV "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "a7084317-5699-4069-8300-bc64e1f37779",
   "metadata": {},
   "outputs": [],
   "source": [
    "vrnts_cadd_sv_scores_df = pd.read_csv(cadd_sv_score_info_path, sep=\"\\t\", dtype={\"plinkID\": str}).rename(columns={\"plinkID\":\"vrnt_id\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "fee77178-2fec-440b-a22d-be6c57aaabd7",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# no INV or MEI annotations \n",
    "vrnts_cadd_sv_df = all_vrnts_in_windows_df.copy()\n",
    "vrnts_cadd_sv_df = pd.merge(vrnts_cadd_sv_df, \n",
    "                             vrnts_cadd_sv_scores_df[[\"vrnt_id\", \"Raw-Score-combined\"]], \n",
    "                             how=\"left\", \n",
    "                             on=\"vrnt_id\")\n",
    "vrnts_cadd_sv_df = vrnts_cadd_sv_df.rename(columns={\"Raw-Score-combined\": \"CADD_sv_raw_score\"})"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "879965f8-75fb-4056-80cb-d6f66062cd5d",
   "metadata": {},
   "source": [
    "### Conservation (PhyloP)\n",
    "\n",
    "* Max approach \n",
    "* Phastcons score seems to max at 1.0, perhaps just PhyloP - larger range of values "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "8af41365-fb7e-4bfc-b48c-50c953d450b1",
   "metadata": {},
   "outputs": [],
   "source": [
    "### phylop - missing some values \n",
    "vrnt_phylop_df = all_vrnts_in_windows_df.copy()\n",
    "phylop_dir_path = Path(phylop_dir)\n",
    "phylop_score_metrics_list = []\n",
    "for chrom in CHROMOSOMES[:22]: \n",
    "    phylop_score_path = phylop_dir_path.joinpath(f\"{chrom}_sv_in_windows_phylop_mean_median_max.tsv\")\n",
    "    phylop_score_metrics_list.append(pd.read_csv(phylop_score_path, sep=\"\\t\"))\n",
    "vrnt_phylop_scored_full_df = pd.concat(phylop_score_metrics_list)\n",
    "vrnt_phylop_scored_df = vrnt_phylop_scored_full_df[[\"vrnt_id\", \"max\"]].rename(columns={\"max\":\"phylop_max\"})\n",
    "vrnt_phylop_df = pd.merge(vrnt_phylop_df, \n",
    "                          vrnt_phylop_scored_df, \n",
    "                          on=\"vrnt_id\",\n",
    "                          how=\"left\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cddf9d73-ae91-49d5-a080-3b5c7fa2088c",
   "metadata": {},
   "source": [
    "### GnomAD constraint score \n",
    "\n",
    "* Max constraint in window approach \n",
    "* Other potential options: \n",
    "    * Weighted sum approach: overlap x constraint  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "d3478309-f8e7-4e85-936f-766794fd169d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# load GnomAD constraint scores \n",
    "gnomad_constraint = BedTool(gnomad_constraint_path)\n",
    "# intersect with bed file \n",
    "gnomad_bed_intersect_cols = {0:\"vrnt_chrom\", 1:\"vrnt_start\", 2:\"vrnt_end\", 3:\"vrnt_id\", 4:\"window_chrom\", \n",
    "                             5:\"window_start\", 6:\"window_end\", 7:\"window_id\", 8:\"pos\", 9:\"exp\", 10:\"obs\", 11:\"oe\", 12:\"zscore\", 13:\"overlap\"}\n",
    "sv_intersect_gnomad_constraint_str = StringIO(str(vrnts_in_windows_bed_sorted.intersect(gnomad_constraint, wo=True)))\n",
    "sv_intersect_gnomad_constraint_df = pd.read_csv(sv_intersect_gnomad_constraint_str, sep=\"\\t\", header=None).rename(columns=gnomad_bed_intersect_cols).astype({\"vrnt_id\":str})\n",
    "# max constraint score \n",
    "sv_intersect_max_constraint_df = pd.DataFrame(sv_intersect_gnomad_constraint_df.groupby(\"vrnt_id\")[\"zscore\"].max()).reset_index().rename(columns={\"zscore\":f\"gnomad_constraint_max_zscore\"})\n",
    "# around n=5041 are NaNs, observe enrichment of NaNs in misexpression-associated SVs \n",
    "vrnt_gnomad_constraint_df = all_vrnts_in_windows_df.copy()\n",
    "vrnt_gnomad_constraint_df = pd.merge(vrnt_gnomad_constraint_df, \n",
    "                                     sv_intersect_max_constraint_df, \n",
    "                                     how=\"outer\", \n",
    "                                     on=\"vrnt_id\")\n",
    "## add binary term for SV > z-score 4\n",
    "vrnt_gnomad_constraint_df[\"gnomad_max_constraint_grtr_z4_nonan\"] = np.where(vrnt_gnomad_constraint_df.gnomad_constraint_max_zscore > 4, 1, 0)\n",
    "vrnt_gnomad_constraint_df[\"gnomad_max_constraint_grtr_z4\"] = np.where(vrnt_gnomad_constraint_df.gnomad_constraint_max_zscore.isna(), \n",
    "                                                                      np.nan, \n",
    "                                                                     vrnt_gnomad_constraint_df.gnomad_max_constraint_grtr_z4_nonan\n",
    "                                                                     )"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "32701e2f-ed3d-430a-9c95-4110e5263250",
   "metadata": {},
   "source": [
    "### Gerp Conserved elements "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "d1273c65-8270-46bc-92b0-d23023c40aae",
   "metadata": {},
   "outputs": [],
   "source": [
    "gerp_elements = BedTool(gerp_elements_path)\n",
    "# intersect with bed file \n",
    "gerp_bed_intersect_cols = {0:\"vrnt_chrom\", 1:\"vrnt_start\", 2:\"vrnt_end\", 3:\"vrnt_id\", 4:\"element_chrom\", \n",
    "                             5:\"element_start\", 6:\"element_end\", 7:\"element_id\", 8:\"overlap\"}\n",
    "sv_intersect_gerp_elements_str = StringIO(str(vrnts_in_windows_bed_sorted.intersect(gerp_elements, wo=True)))\n",
    "sv_intersect_gerp_elements_df = pd.read_csv(sv_intersect_gerp_elements_str, sep=\"\\t\", header=None).rename(columns=gerp_bed_intersect_cols).astype({\"vrnt_id\":str})\n",
    "sv_intersect_gerp_elements = sv_intersect_gerp_elements_df.vrnt_id.unique()\n",
    "# gerp element overlap and count\n",
    "vrnt_gerp_elements_df = all_vrnts_in_windows_df.copy()\n",
    "vrnt_gerp_element_count_df = pd.DataFrame(sv_intersect_gerp_elements_df.groupby(\"vrnt_id\", as_index=False).element_id.count()).rename(columns={\"element_id\":\"gerp_element_count\"})\n",
    "vrnt_gerp_elements_df = pd.merge(vrnt_gerp_elements_df, \n",
    "                                 vrnt_gerp_element_count_df, \n",
    "                                 on=\"vrnt_id\", \n",
    "                                 how=\"left\"\n",
    "                                ).fillna(0)\n",
    "\n",
    "vrnt_gerp_elements_df[\"gerp_element_overlap\"] = np.where(vrnt_gerp_elements_df.vrnt_id.isin(sv_intersect_gerp_elements), 1, 0)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f4d058f7",
   "metadata": {},
   "source": [
    "### PhastCons"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "fb469a5c",
   "metadata": {},
   "outputs": [],
   "source": [
    "phastcons_elements = BedTool(phastcons_elements_path)\n",
    "# intersect with bed file \n",
    "phastcons_bed_intersect_cols = {0:\"vrnt_chrom\", 1:\"vrnt_start\", 2:\"vrnt_end\", 3:\"vrnt_id\", 4:\"element_chrom\", \n",
    "                             5:\"element_start\", 6:\"element_end\", 7:\"element_id\", 8:\"overlap\"}\n",
    "sv_intersect_phastcons_elements_str = StringIO(str(vrnts_in_windows_bed_sorted.intersect(phastcons_elements, wo=True)))\n",
    "sv_intersect_phastcons_elements_df = pd.read_csv(sv_intersect_phastcons_elements_str, sep=\"\\t\", header=None).rename(columns=phastcons_bed_intersect_cols).astype({\"vrnt_id\":str})\n",
    "sv_intersect_phastcons_elements = sv_intersect_phastcons_elements_df.vrnt_id.unique()\n",
    "# gerp element overlap and count\n",
    "vrnt_phastcons_elements_df = all_vrnts_in_windows_df.copy()\n",
    "vrnt_phastcons_element_count_df = pd.DataFrame(sv_intersect_phastcons_elements_df.groupby(\"vrnt_id\", as_index=False).element_id.count()).rename(columns={\"element_id\":\"phastcons_element_count\"})\n",
    "vrnt_phastcons_elements_df = pd.merge(vrnt_phastcons_elements_df, \n",
    "                                 vrnt_phastcons_element_count_df, \n",
    "                                 on=\"vrnt_id\", \n",
    "                                 how=\"left\"\n",
    "                                ).fillna(0)\n",
    "\n",
    "vrnt_phastcons_elements_df[\"phastcons_element_overlap\"] = np.where(vrnt_phastcons_elements_df.vrnt_id.isin(sv_intersect_phastcons_elements), 1, 0)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d8fb7722",
   "metadata": {},
   "source": [
    "### gwRVIS "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "074f1f2c",
   "metadata": {},
   "outputs": [],
   "source": [
    "vrnt_gwrvis_df = all_vrnts_in_windows_df.copy()\n",
    "gwrvis_dir_path = Path(gwrvis_dir)\n",
    "gwrvis_score_metrics_list = []\n",
    "for chrom in CHROMOSOMES[:22]: \n",
    "    gwrvis_score_path = gwrvis_dir_path.joinpath(f\"{chrom}_sv_in_windows_gwrvis.tsv\")\n",
    "    gwrvis_score_metrics_list.append(pd.read_csv(gwrvis_score_path, sep=\"\\t\"))\n",
    "vrnt_gwrvis_scored_full_df = pd.concat(gwrvis_score_metrics_list)\n",
    "vrnt_gwrvis_scored_df = vrnt_gwrvis_scored_full_df[[\"vrnt_id\", \"min\"]].rename(columns={\"min\":\"gwrvis_min\"})\n",
    "vrnt_gwrvis_df = pd.merge(vrnt_gwrvis_df, \n",
    "                          vrnt_gwrvis_scored_df, \n",
    "                          on=\"vrnt_id\",                          \n",
    "                          how=\"left\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "64c2b735-971a-407c-b1e9-9920772c66c7",
   "metadata": {},
   "source": [
    "### Conserved Enhancer elements \n",
    "\n",
    "* Too few elements overlapping across entire test set to use "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "6f9612db-9ac0-4e44-ba50-07844b9f6d6a",
   "metadata": {},
   "outputs": [],
   "source": [
    "fantom5_conserved_elements = BedTool(fantom_5enh_path)\n",
    "# intersect with bed file \n",
    "fantom5_conserved_bed_intersect_cols = {0:\"vrnt_chrom\", 1:\"vrnt_start\", 2:\"vrnt_end\", 3:\"vrnt_id\", 4:\"element_chrom\", \n",
    "                                        5:\"element_start\", 6:\"element_end\", 7:\"element_id\", 8:\"overlap\"}\n",
    "sv_intersect_f5_elements_str = StringIO(str(vrnts_in_windows_bed_sorted.intersect(fantom5_conserved_elements, wo=True)))\n",
    "sv_intersect_f5_elements_df = pd.read_csv(sv_intersect_f5_elements_str, sep=\"\\t\", header=None).rename(columns=fantom5_conserved_bed_intersect_cols).astype({\"vrnt_id\":str})\n",
    "sv_intersect_f5_elements = sv_intersect_f5_elements_df.vrnt_id.unique()\n",
    "# element overlap\n",
    "vrnt_f5_elements_df = all_vrnts_in_windows_df.copy()\n",
    "vrnt_f5_elements_df[\"f5_consv_element_overlap\"] = np.where(vrnt_f5_elements_df.vrnt_id.isin(sv_intersect_f5_elements), 1, 0)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a68b19c4-f598-4fae-aee8-aaff8b9b2b6a",
   "metadata": {},
   "source": [
    "### Human accelerated regions (HARs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "bce5db97-9323-4f5b-bf59-8a984d7d39c9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# load HARs \n",
    "hars_bed = BedTool(hars_bed_path).sort()\n",
    "bed_intersect_cols_hars = {0:\"vrnt_chrom\", 1:\"vrnt_start\", 2:\"vrnt_end\", 3:\"vrnt_id\", 4:\"har_chrom\", \n",
    "                           5:\"har_start\", 6:\"har_end\", 7:\"har_name\", 8:\"overlap\"}\n",
    "# identify SVs that overlap HARs\n",
    "sv_intersect_hars_str = StringIO(str(vrnts_in_windows_bed_sorted.intersect(hars_bed, wo=True)))\n",
    "sv_intersect_hars_df = pd.read_csv(sv_intersect_hars_str, sep=\"\\t\", header=None).rename(columns=bed_intersect_cols_hars).astype({\"vrnt_id\":str})\n",
    "sv_intersect_hars = sv_intersect_hars_df.vrnt_id.unique()\n",
    "# annotate variants intersecting at least one HAR\n",
    "vrnt_har_intersect_df = all_vrnts_in_windows_df.copy()\n",
    "vrnt_har_intersect_df[\"intersect_har\"] = np.where(vrnt_har_intersect_df.vrnt_id.isin(sv_intersect_hars), 1, 0)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4abbbbee-e890-40ca-b4ad-714422ebf2ea",
   "metadata": {},
   "source": [
    "### Combine Features "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "90e096a9-ee85-4de1-9c3c-247960299357",
   "metadata": {},
   "outputs": [],
   "source": [
    "# merge different features \n",
    "dfs_to_merge = [vrnt_gnomad_constraint_df, \n",
    "                vrnt_phylop_df, \n",
    "                vrnts_cadd_sv_df, \n",
    "                vrnt_gerp_elements_df, \n",
    "                vrnt_har_intersect_df, \n",
    "               vrnt_phastcons_elements_df, \n",
    "                vrnt_f5_elements_df, \n",
    "                vrnt_gwrvis_df, \n",
    "               ]\n",
    "vrnt_features_merged_df = reduce(lambda  left,right: pd.merge(left,right, on=all_vrnts_in_windows_df.columns.tolist(),\n",
    "                                                              how='inner'), dfs_to_merge)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "dae2deaa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# structural variant information \n",
    "sv_info_df =pd.read_csv(sv_info_path, sep=\"\\t\", dtype={\"plinkID\": str}).rename(columns={\"plinkID\":\"vrnt_id\"})\n",
    "\n",
    "vrnt_features_merged_info_df = pd.merge(vrnt_features_merged_df, \n",
    "                                   sv_info_df, \n",
    "                                   on=\"vrnt_id\", \n",
    "                                   how=\"left\"\n",
    "                                  )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "26e9e59a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# add coding/non-coding information \n",
    "#sv_test_control_coding_info_path = wkdir_path.joinpath(\"5_misexp_vrnts/test_cntrl_sets/non_coding_svs/vrnt_id_in_windows_misexp_genes_coding_no_exon_1kb.tsv\")\n",
    "#sv_test_control_coding_info_df = pd.read_csv(sv_test_control_coding_info_path, sep=\"\\t\")\n",
    "#vrnt_features_merged_info_coding_df = pd.merge(vrnt_features_merged_info_df, sv_test_control_coding_info_df[[\"vrnt_id\", \"coding\"]], on=[\"vrnt_id\"], how=\"left\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "368164bc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# write to file \n",
    "vrnt_features_out = out_dir.joinpath(\"vrnt_features_scores.csv\")\n",
    "vrnt_features_merged_info_df.to_csv(vrnt_features_out, index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b2ad042d-e9fd-40cc-8e21-968d03f01e3f",
   "metadata": {},
   "source": [
    "### Logistic regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "0ec4ee95-9dc9-415e-a974-8cd76a070028",
   "metadata": {
    "jupyter": {
     "outputs_hidden": true
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DEL gnomad_constraint_max_zscore\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.030201\n",
      "         Iterations 10\n",
      "DEL gnomad_max_constraint_grtr_z4\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.030150\n",
      "         Iterations 9\n",
      "DEL phylop_max\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.029485\n",
      "         Iterations 10\n",
      "DEL CADD_sv_raw_score\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.030165\n",
      "         Iterations 10\n",
      "DEL intersect_har\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.030431\n",
      "         Iterations 9\n",
      "DEL phastcons_element_count\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.030422\n",
      "         Iterations 9\n",
      "DEL gerp_element_count\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.030400\n",
      "         Iterations 9\n",
      "DEL gwrvis_min\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.030512\n",
      "         Iterations 9\n",
      "DUP gnomad_constraint_max_zscore\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.049609\n",
      "         Iterations 9\n",
      "DUP gnomad_max_constraint_grtr_z4\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.047628\n",
      "         Iterations 10\n",
      "DUP phylop_max\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.047695\n",
      "         Iterations 10\n",
      "DUP CADD_sv_raw_score\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.046811\n",
      "         Iterations 9\n",
      "DUP intersect_har\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.050124\n",
      "         Iterations 9\n",
      "DUP phastcons_element_count\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.049002\n",
      "         Iterations 9\n",
      "DUP gerp_element_count\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.049721\n",
      "         Iterations 9\n",
      "DUP gwrvis_min\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.047935\n",
      "         Iterations 9\n"
     ]
    }
   ],
   "source": [
    "feature_list = [\"gnomad_constraint_max_zscore\", \n",
    "                \"gnomad_max_constraint_grtr_z4\",\n",
    "                \"phylop_max\", \n",
    "                \"CADD_sv_raw_score\", \n",
    "                \"intersect_har\",\n",
    "                \"phastcons_element_count\", \n",
    "                \"gerp_element_count\", \n",
    "                \"gwrvis_min\",\n",
    "               ] \n",
    "\n",
    "region = \"all\"\n",
    "sv_types_logistic_regr_results = {}\n",
    "logr_count = 0          \n",
    "for sv_type in [\"DEL\", \"DUP\"]:\n",
    "    for feature in feature_list:\n",
    "        print(sv_type, feature)\n",
    "        # remove NaNs before normalising by length \n",
    "        input_df = vrnt_features_merged_info_df[(vrnt_features_merged_info_df.SVTYPE == sv_type) & \n",
    "                                                (vrnt_features_merged_info_df[feature].notna())].copy()\n",
    "        input_df[\"svlen_norm\"] = (input_df[\"SVLEN\"] - input_df[\"SVLEN\"].mean())/input_df[\"SVLEN\"].std()\n",
    "        input_df[f\"{feature}_norm\"] = (input_df[feature] - input_df[feature].mean())/input_df[feature].std()\n",
    "        y, X = dmatrices(f'misexp_uniq ~ {feature}_norm + svlen_norm', input_df, return_type = 'dataframe')\n",
    "        logit_fit = discrete_model.Logit(endog=y, exog=X).fit()\n",
    "        log_odds, pval = logit_fit.params[1], logit_fit.pvalues[1]\n",
    "        # normal approximation confidence intervals\n",
    "        lower_conf = logit_fit.conf_int(alpha=0.05)[0][1]\n",
    "        upper_conf = logit_fit.conf_int(alpha=0.05)[1][1]\n",
    "        sv_types_logistic_regr_results[logr_count] = [feature, sv_type, region, log_odds, lower_conf, upper_conf, pval]\n",
    "        logr_count += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "b5d6bd59-821e-4949-91d2-1e39232876bf",
   "metadata": {},
   "outputs": [],
   "source": [
    "columns_logr_enrich = [\"feature\", \"sv_type\", \"region\", \"log_odds\", \"lower\", \"upper\", \"pval\"]\n",
    "sv_types_logistic_regr_results_df = pd.DataFrame.from_dict(sv_types_logistic_regr_results, orient=\"index\", columns=columns_logr_enrich)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "762403cb-2653-4983-aea3-732a4edcc2fb",
   "metadata": {},
   "source": [
    "### Multiple Testing Correction "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "f6c35499-9464-4a1c-bd63-77180d8ae4d7",
   "metadata": {},
   "outputs": [],
   "source": [
    "pval_list = sv_types_logistic_regr_results_df.pval.to_numpy()\n",
    "# multiple testing correction\n",
    "reject, pvals_corrected, _, _ = multitest.multipletests(pval_list, alpha=0.05, method=\"fdr_bh\")\n",
    "sv_types_logistic_regr_results_df[\"pass_fdr_bh\"] = reject\n",
    "sv_types_logistic_regr_results_df[\"pvals_corrected_fdr_bh\"] = pvals_corrected\n",
    "# Bonferroni correction \n",
    "reject, pvals_corrected, _, _ = multitest.multipletests(pval_list, alpha=0.05, method=\"bonferroni\")\n",
    "sv_types_logistic_regr_results_df[\"pass_bonf\"] = reject\n",
    "sv_types_logistic_regr_results_df[\"pvals_corrected_bonf\"] = pvals_corrected"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7be175d1",
   "metadata": {},
   "source": [
    "### Significant results "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "372c67ba",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DEL phylop_max 0.4471033940033347 2.6303286921224432e-05\n",
      "DEL CADD_sv_raw_score 0.3324963545651643 0.015100119714700133\n",
      "DUP gnomad_constraint_max_zscore 0.9162837441442854 0.0037933639350685184\n",
      "DUP gnomad_max_constraint_grtr_z4 0.9473559926527871 0.0006432827739679205\n",
      "DUP phylop_max 1.104646071751581 0.027927586760368675\n",
      "DUP CADD_sv_raw_score 0.7566751604334726 0.001987429601630485\n",
      "DUP gwrvis_min -0.7881086546231187 0.030020598875784076\n"
     ]
    }
   ],
   "source": [
    "for index, row in sv_types_logistic_regr_results_df.iterrows(): \n",
    "    pass_bonf = row[\"pass_bonf\"]\n",
    "    if pass_bonf: \n",
    "        print(row[\"sv_type\"], row[\"feature\"], row[\"log_odds\"], row[\"pvals_corrected_bonf\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6c8f28e1",
   "metadata": {},
   "source": [
    "### Write to file "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "17552bb2",
   "metadata": {},
   "outputs": [],
   "source": [
    "### clean output \n",
    "features_to_keep = {#'gerp_element_count': \"Conserved elements\", \n",
    "                    'CADD_sv_raw_score': \"CADD-SV\",\n",
    "                    'phylop_max': \"Conservation\",\n",
    "                    #'gnomad_max_constraint_grtr_z4': \"gnomAD constraint z > 4\", \n",
    "                    \"gnomad_constraint_max_zscore\": \"gnomAD constraint\",\n",
    "                    'gwrvis_min': \"gwRVIS\",\n",
    "                    'intersect_har': \"HARs\"\n",
    "                   }\n",
    "sv_types_logistic_regr_results_features_to_keep_df = sv_types_logistic_regr_results_df[sv_types_logistic_regr_results_df.feature.isin(features_to_keep.keys())].copy()\n",
    "sv_types_logistic_regr_results_features_to_keep_df[\"feature_name\"] = sv_types_logistic_regr_results_features_to_keep_df.feature.replace(features_to_keep)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "76d7689b",
   "metadata": {},
   "outputs": [],
   "source": [
    "features_to_keep_path = out_dir.joinpath(\"misexp_sv_scores_features.tsv\")\n",
    "sv_types_logistic_regr_results_features_to_keep_df.to_csv(features_to_keep_path, sep=\"\\t\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0ad64f91",
   "metadata": {},
   "source": [
    "### Non length-adjusted enrichment "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "478e8db4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DEL gnomad_constraint_max_zscore\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.030361\n",
      "         Iterations 10\n",
      "DEL gnomad_max_constraint_grtr_z4\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.030293\n",
      "         Iterations 9\n",
      "DEL phylop_max\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.029516\n",
      "         Iterations 10\n",
      "DEL CADD_sv_raw_score\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.030233\n",
      "         Iterations 10\n",
      "DEL intersect_har\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.030519\n",
      "         Iterations 9\n",
      "DEL phastcons_element_count\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.030425\n",
      "         Iterations 9\n",
      "DEL gerp_element_count\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.030401\n",
      "         Iterations 9\n",
      "DEL gwrvis_min\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.030642\n",
      "         Iterations 9\n",
      "DUP gnomad_constraint_max_zscore\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.050113\n",
      "         Iterations 9\n",
      "DUP gnomad_max_constraint_grtr_z4\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.048009\n",
      "         Iterations 10\n",
      "DUP phylop_max\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.048141\n",
      "         Iterations 10\n",
      "DUP CADD_sv_raw_score\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.048595\n",
      "         Iterations 9\n",
      "DUP intersect_har\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.053113\n",
      "         Iterations 9\n",
      "DUP phastcons_element_count\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.049099\n",
      "         Iterations 9\n",
      "DUP gerp_element_count\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.049744\n",
      "         Iterations 9\n",
      "DUP gwrvis_min\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.048433\n",
      "         Iterations 9\n"
     ]
    }
   ],
   "source": [
    "sv_types_logistic_regr_results_no_len_adj = {}\n",
    "for sv_type in [\"DEL\", \"DUP\"]:\n",
    "    for feature in feature_list:\n",
    "        print(sv_type, feature)\n",
    "        # remove NaNs before normalising by length \n",
    "        input_df = vrnt_features_merged_info_df[(vrnt_features_merged_info_df.SVTYPE == sv_type) & \n",
    "                                                (vrnt_features_merged_info_df[feature].notna())].copy()\n",
    "        input_df[f\"{feature}_norm\"] = (input_df[feature] - input_df[feature].mean())/input_df[feature].std()\n",
    "        y, X = dmatrices(f'misexp_uniq ~ {feature}_norm', input_df, return_type = 'dataframe')\n",
    "        logit_fit = discrete_model.Logit(endog=y, exog=X).fit()\n",
    "        log_odds, pval = logit_fit.params[1], logit_fit.pvalues[1]\n",
    "        # normal approximation confidence intervals\n",
    "        lower_conf = logit_fit.conf_int(alpha=0.05)[0][1]\n",
    "        upper_conf = logit_fit.conf_int(alpha=0.05)[1][1]\n",
    "        sv_types_logistic_regr_results_no_len_adj[logr_count] = [feature, sv_type, region, log_odds, lower_conf, upper_conf, pval]\n",
    "        logr_count += 1\n",
    "sv_types_logistic_regr_results_no_len_adj_df = pd.DataFrame.from_dict(sv_types_logistic_regr_results_no_len_adj, orient=\"index\", columns=columns_logr_enrich)\n",
    "# \n",
    "pval_list = sv_types_logistic_regr_results_no_len_adj_df.pval.to_numpy()\n",
    "# multiple testing correction\n",
    "reject, pvals_corrected, _, _ = multitest.multipletests(pval_list, alpha=0.05, method=\"fdr_bh\")\n",
    "sv_types_logistic_regr_results_no_len_adj_df[\"pass_fdr_bh\"] = reject\n",
    "sv_types_logistic_regr_results_no_len_adj_df[\"pvals_corrected_fdr_bh\"] = pvals_corrected\n",
    "# Bonferroni correction \n",
    "reject, pvals_corrected, _, _ = multitest.multipletests(pval_list, alpha=0.05, method=\"bonferroni\")\n",
    "sv_types_logistic_regr_results_no_len_adj_df[\"pass_bonf\"] = reject\n",
    "sv_types_logistic_regr_results_no_len_adj_df[\"pvals_corrected_bonf\"] = pvals_corrected"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "d6ed73d9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DEL phylop_max 0.47652916239774784 1.0460493711318187e-06\n",
      "DEL CADD_sv_raw_score 0.3767640884019741 0.0014388999253719206\n",
      "DEL phastcons_element_count 0.13084673607783107 0.002278090365857613\n",
      "DEL gerp_element_count 0.1384361682458371 0.0009721632539310822\n",
      "DUP gnomad_constraint_max_zscore 1.0432872344083846 6.679166661325797e-05\n",
      "DUP gnomad_max_constraint_grtr_z4 1.025166559823696 3.943793444888182e-05\n",
      "DUP phylop_max 1.2456936227266526 0.0029161015656206635\n",
      "DUP CADD_sv_raw_score 0.7856179248653821 0.00025153269374883025\n",
      "DUP phastcons_element_count 0.4493134581338546 4.423355630459484e-05\n",
      "DUP gerp_element_count 0.4294918442204741 0.00028237353460778224\n",
      "DUP gwrvis_min -0.9254566335119306 0.00048781561514272756\n"
     ]
    }
   ],
   "source": [
    "for index, row in sv_types_logistic_regr_results_no_len_adj_df.iterrows(): \n",
    "    pass_bonf = row[\"pass_bonf\"]\n",
    "    if pass_bonf: \n",
    "        print(row[\"sv_type\"], row[\"feature\"], row[\"log_odds\"], row[\"pvals_corrected_bonf\"])\n",
    "        \n",
    "### clean output \n",
    "features_to_keep = {#'gerp_element_count': \"Conserved elements\", \n",
    "                    'CADD_sv_raw_score': \"CADD-SV\",\n",
    "                    'phylop_max': \"Conservation\",\n",
    "                    #'gnomad_max_constraint_grtr_z4': \"gnomAD constraint z > 4\", \n",
    "                    \"gnomad_constraint_max_zscore\": \"gnomAD constraint\",\n",
    "                    'gwrvis_min': \"gwRVIS\",\n",
    "                    'intersect_har': \"HARs\"\n",
    "                   }\n",
    "sv_types_logistic_regr_results_no_len_adj_features_to_keep_df = sv_types_logistic_regr_results_no_len_adj_df[sv_types_logistic_regr_results_no_len_adj_df.feature.isin(features_to_keep.keys())].copy()\n",
    "sv_types_logistic_regr_results_no_len_adj_features_to_keep_df[\"feature_name\"] = sv_types_logistic_regr_results_no_len_adj_df.feature.replace(features_to_keep)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "dc11d9df",
   "metadata": {},
   "outputs": [],
   "source": [
    "features_to_keep_no_len_adj_path = out_dir.joinpath(\"misexp_sv_scores_features_no_len_adj.tsv\")\n",
    "sv_types_logistic_regr_results_no_len_adj_features_to_keep_df.to_csv(features_to_keep_no_len_adj_path, sep=\"\\t\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "98d1acd4",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
